library(readxl)
file_path <- "/home/guido/Downloads/CogED xDiag xVero.xlsx"
dataAD <- read_excel(file_path, sheet = "CogEd AD", range = "B1:R478")
dataCN <- read_excel(file_path, sheet = "CogEd CN", range = "A1:Q669")
dataFTD <- read_excel(file_path, sheet = "CogEd FTD", range = "A1:Q274")
cat("dataAD:", dim(dataAD), "\n")## dataAD: 477 17
## dataCN: 668 17
## dataFTD: 273 17
data <- rbind(dataAD, dataCN, dataFTD)
data$clinical_diagnosis <- as.factor(data$clinical_diagnosis)
data$cog_tmt_a_err <- as.numeric(ifelse(data$cog_tmt_a_err=="completed", 0, data$cog_tmt_a_err))
data$cog_tmt_a_corr <- as.numeric(ifelse(data$cog_tmt_a_corr=="completed", 25, data$cog_tmt_a_corr))
data$cog_tmt_b_err <- as.numeric(ifelse(data$cog_tmt_b_err=="completed", 0, data$cog_tmt_b_err))
data$cog_tmt_b_corr <- as.numeric(ifelse(data$cog_tmt_b_corr=="completed", 25, data$cog_tmt_b_corr))
data$cog_digits_forward_span <- as.numeric(ifelse(data$cog_digits_forward_span=="completed", 8, data$cog_digits_forward_span))
data$cog_digits_backward_span <- as.numeric(ifelse(data$cog_digits_backward_span=="completed", 7, data$cog_digits_backward_span))
data$cog_digits_forward_total <- ifelse(data$cog_digits_forward_total>16, NA, data$cog_digits_forward_total)
data$cog_digits_backward_total <- ifelse(data$cog_digits_backward_total>14, NA, data$cog_digits_backward_total)
data$cog_tmt_a <- ifelse(data$cog_tmt_a>150, NA, data$cog_tmt_a)
data$cog_tmt_b <- ifelse(data$cog_tmt_b>300, NA, data$cog_tmt_b)plot_col_hist <- function(data, title,column) {
hist(data[[column]],
main = title,
xlab = "Años",
ylab = "Frecuencia",
col = "lightblue",
breaks = 20)
}
plot_col_hist(data, "Años de educación","cog_ed")tabla1 <-
tbl_summary(
data = data,
by = clinical_diagnosis, # para comparar grupos
include = c( # 👈 incluimos solo estas variables
cog_ed,
cog_digits_forward_total,
cog_digits_forward_span,
cog_digits_backward_total,
cog_digits_backward_span,
mmse_total,
cog_category_animals,
cog_category_vegetables,
cog_tmt_a,
cog_tmt_a_corr,
cog_tmt_a_err,
cog_tmt_b,
cog_tmt_b_corr,
cog_tmt_b_err
),
missing = "ifany",
missing_text = "No realizado",
label = list(
cog_ed ~ "Años de educación",
cog_digits_forward_total ~ "Digits forward total",
cog_digits_forward_span ~ "Digits forward span",
cog_digits_backward_total ~ "Digits backwards total",
cog_digits_backward_span ~ "Digits backwards span",
mmse_total ~ "Mini mental",
cog_category_animals ~ "Fluidez semántica (animales)",
cog_category_vegetables ~ "Fluidez semántica (vegetales)",
cog_tmt_a ~ "Atención sostenida (tiempo)",
cog_tmt_a_corr ~ "Atención sostenida (correctas)",
cog_tmt_a_err ~ "Atención sostenida (errores)",
cog_tmt_b ~ "Funciones ejecutivas (tiempo)",
cog_tmt_b_corr ~ "Funciones ejecutivas (correctas)",
cog_tmt_b_err ~ "Funciones ejecutivas (errores)"
),
type = list( # forzamos todas como continuas
cog_ed ~ "continuous",
cog_digits_forward_total ~ "continuous",
cog_digits_forward_span ~ "continuous",
cog_digits_backward_total ~ "continuous",
cog_digits_backward_span ~ "continuous",
mmse_total ~ "continuous",
cog_category_animals ~ "continuous",
cog_category_vegetables ~ "continuous",
cog_tmt_a ~ "continuous",
cog_tmt_a_corr ~ "continuous",
cog_tmt_a_err ~ "continuous",
cog_tmt_b ~ "continuous",
cog_tmt_b_corr ~ "continuous",
cog_tmt_b_err ~ "continuous"
),
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})",
all_categorical() ~ "{n} ({p}%)"
)
) %>%
add_overall() %>%
add_p(
test = list(
all_continuous() ~ "kruskal.test",
all_categorical() ~ "chisq.test"
)
) %>%
bold_labels()
tabla1| Characteristic | Overall N = 1,1061 |
AD N = 4091 |
CN N = 4941 |
FTD N = 2031 |
p-value2 |
|---|---|---|---|---|---|
| Años de educación | 13.0 (6.0, 16.0) | 12.0 (8.0, 16.0) | 12.0 (5.0, 17.0) | 14.0 (11.0, 16.0) | <0.001 |
| Digits forward total | 5.00 (4.00, 7.00) | 5.00 (3.00, 6.00) | 5.00 (4.00, 7.00) | 5.00 (3.00, 6.00) | <0.001 |
| No realizado | 10 | 5 | 0 | 5 | |
| Digits forward span | 5.00 (4.00, 6.00) | 5.00 (4.00, 6.00) | 5.00 (4.00, 6.00) | 5.00 (4.00, 6.00) | 0.001 |
| No realizado | 7 | 4 | 0 | 3 | |
| Digits backwards total | 4.00 (3.00, 5.00) | 3.00 (2.00, 4.00) | 4.00 (3.00, 6.00) | 3.00 (2.00, 5.00) | <0.001 |
| No realizado | 25 | 7 | 6 | 12 | |
| Digits backwards span | 3.00 (3.00, 4.00) | 3.00 (2.00, 4.00) | 4.00 (3.00, 5.00) | 3.00 (2.00, 4.00) | <0.001 |
| No realizado | 7 | 4 | 0 | 3 | |
| Mini mental | 24.0 (20.0, 28.0) | 21.0 (18.0, 24.0) | 28.0 (24.0, 29.0) | 23.0 (17.0, 26.0) | <0.001 |
| No realizado | 3 | 2 | 1 | 0 | |
| Fluidez semántica (animales) | 14 (10, 20) | 11 (8, 14) | 19 (15, 23) | 9 (5, 15) | <0.001 |
| No realizado | 6 | 3 | 0 | 3 | |
| Fluidez semántica (vegetales) | 11 (6, 15) | 7 (4, 10) | 15 (12, 19) | 6 (3, 10) | <0.001 |
| No realizado | 19 | 8 | 1 | 10 | |
| Atención sostenida (tiempo) | 65 (40, 107) | 88 (56, 147) | 49 (33, 79) | 76 (46, 122) | <0.001 |
| No realizado | 64 | 36 | 4 | 24 | |
| Atención sostenida (correctas) | 24.0 (24.0, 24.0) | 24.0 (24.0, 24.0) | 24.0 (24.0, 24.0) | 24.0 (24.0, 24.0) | <0.001 |
| No realizado | 26 | 17 | 0 | 9 | |
| Atención sostenida (errores) | 0.00 (0.00, 0.00) | 0.00 (0.00, 0.00) | 0.00 (0.00, 0.00) | 0.00 (0.00, 1.00) | 0.033 |
| No realizado | 26 | 17 | 0 | 9 | |
| Funciones ejecutivas (tiempo) | 168 (92, 300) | 300 (164, 300) | 110 (71, 208) | 195 (126, 300) | <0.001 |
| No realizado | 285 | 136 | 83 | 66 | |
| Funciones ejecutivas (correctas) | 24 (23, 24) | 24 (13, 25) | 24 (24, 24) | 24 (22, 25) | <0.001 |
| No realizado | 53 | 35 | 1 | 17 | |
| Funciones ejecutivas (errores) | 0.00 (0.00, 2.00) | 1.00 (0.00, 3.00) | 0.00 (0.00, 1.00) | 0.00 (0.00, 2.00) | <0.001 |
| No realizado | 54 | 35 | 2 | 17 | |
| 1 Median (Q1, Q3) | |||||
| 2 Kruskal-Wallis rank sum test | |||||
data2 <- data %>%
filter(clinical_diagnosis != "CN")
data2$clinical_diagnosis <- droplevels(data2$clinical_diagnosis)
tabla1 <-
tbl_summary(
data = data2,
by = clinical_diagnosis, # para comparar grupos
include = c( # 👈 incluimos solo estas variables
cog_ed,
cog_digits_forward_total,
cog_digits_forward_span,
cog_digits_backward_total,
cog_digits_backward_span,
mmse_total,
cog_category_animals,
cog_category_vegetables,
cog_tmt_a,
cog_tmt_a_corr,
cog_tmt_a_err,
cog_tmt_b,
cog_tmt_b_corr,
cog_tmt_b_err
),
missing = "ifany",
missing_text = "No realizado",
label = list(
cog_ed ~ "Años de educación",
cog_digits_forward_total ~ "Digits forward total",
cog_digits_forward_span ~ "Digits forward span",
cog_digits_backward_total ~ "Digits backwards total",
cog_digits_backward_span ~ "Digits backwards span",
mmse_total ~ "Mini mental",
cog_category_animals ~ "Fluidez semántica (animales)",
cog_category_vegetables ~ "Fluidez semántica (vegetales)",
cog_tmt_a ~ "Atención sostenida (tiempo)",
cog_tmt_a_corr ~ "Atención sostenida (correctas)",
cog_tmt_a_err ~ "Atención sostenida (errores)",
cog_tmt_b ~ "Funciones ejecutivas (tiempo)",
cog_tmt_b_corr ~ "Funciones ejecutivas (correctas)",
cog_tmt_b_err ~ "Funciones ejecutivas (errores)"
),
type = list( # forzamos todas como continuas
cog_ed ~ "continuous",
cog_digits_forward_total ~ "continuous",
cog_digits_forward_span ~ "continuous",
cog_digits_backward_total ~ "continuous",
cog_digits_backward_span ~ "continuous",
mmse_total ~ "continuous",
cog_category_animals ~ "continuous",
cog_category_vegetables ~ "continuous",
cog_tmt_a ~ "continuous",
cog_tmt_a_corr ~ "continuous",
cog_tmt_a_err ~ "continuous",
cog_tmt_b ~ "continuous",
cog_tmt_b_corr ~ "continuous",
cog_tmt_b_err ~ "continuous"
),
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})",
all_categorical() ~ "{n} ({p}%)"
)
) %>%
add_overall() %>%
add_p(
test = list(
all_continuous() ~ "wilcox.test",
all_categorical() ~ "chisq.test"
)
) %>%
bold_labels()
tabla1| Characteristic | Overall N = 6121 |
AD N = 4091 |
FTD N = 2031 |
p-value2 |
|---|---|---|---|---|
| Años de educación | 13.5 (10.0, 16.0) | 12.0 (8.0, 16.0) | 14.0 (11.0, 16.0) | 0.006 |
| Digits forward total | 5.00 (3.00, 6.00) | 5.00 (3.00, 6.00) | 5.00 (3.00, 6.00) | >0.9 |
| No realizado | 10 | 5 | 5 | |
| Digits forward span | 5.00 (4.00, 6.00) | 5.00 (4.00, 6.00) | 5.00 (4.00, 6.00) | 0.6 |
| No realizado | 7 | 4 | 3 | |
| Digits backwards total | 3.00 (2.00, 4.00) | 3.00 (2.00, 4.00) | 3.00 (2.00, 5.00) | >0.9 |
| No realizado | 19 | 7 | 12 | |
| Digits backwards span | 3.00 (2.00, 4.00) | 3.00 (2.00, 4.00) | 3.00 (2.00, 4.00) | 0.3 |
| No realizado | 7 | 4 | 3 | |
| Mini mental | 21.0 (18.0, 25.0) | 21.0 (18.0, 24.0) | 23.0 (17.0, 26.0) | 0.004 |
| No realizado | 2 | 2 | 0 | |
| Fluidez semántica (animales) | 10.0 (7.0, 14.0) | 11.0 (8.0, 14.0) | 9.0 (5.0, 15.0) | 0.005 |
| No realizado | 6 | 3 | 3 | |
| Fluidez semántica (vegetales) | 6.0 (4.0, 10.0) | 7.0 (4.0, 10.0) | 6.0 (3.0, 10.0) | 0.066 |
| No realizado | 18 | 8 | 10 | |
| Atención sostenida (tiempo) | 84 (51, 137) | 88 (56, 147) | 76 (46, 122) | 0.008 |
| No realizado | 60 | 36 | 24 | |
| Atención sostenida (correctas) | 24.0 (24.0, 24.0) | 24.0 (24.0, 24.0) | 24.0 (24.0, 24.0) | 0.3 |
| No realizado | 26 | 17 | 9 | |
| Atención sostenida (errores) | 0.00 (0.00, 1.00) | 0.00 (0.00, 0.00) | 0.00 (0.00, 1.00) | 0.5 |
| No realizado | 26 | 17 | 9 | |
| Funciones ejecutivas (tiempo) | 269 (147, 300) | 300 (164, 300) | 195 (126, 300) | <0.001 |
| No realizado | 202 | 136 | 66 | |
| Funciones ejecutivas (correctas) | 24 (14, 25) | 24 (13, 25) | 24 (22, 25) | 0.031 |
| No realizado | 52 | 35 | 17 | |
| Funciones ejecutivas (errores) | 1.00 (0.00, 3.00) | 1.00 (0.00, 3.00) | 0.00 (0.00, 2.00) | 0.030 |
| No realizado | 52 | 35 | 17 | |
| 1 Median (Q1, Q3) | ||||
| 2 Wilcoxon rank sum test | ||||
plot_corr <- function(df, vars_to_keep, label_map, title = NULL) {
# Filter only numeric columns listed in vars_to_keep
data_num <- df %>%
select(all_of(vars_to_keep))
# Compute correlation matrix
mat_cor <- cor(data_num, use = "pairwise.complete.obs", method = "pearson")
# Apply labels
mat_cor_labeled <- mat_cor
colnames(mat_cor_labeled) <- label_map[colnames(mat_cor_labeled)]
rownames(mat_cor_labeled) <- label_map[rownames(mat_cor_labeled)]
# Palette
paleta <- colorRampPalette(RColorBrewer::brewer.pal(11, "RdBu"))(100)
# Plot
corrplot(mat_cor_labeled,
method = "color",
type = "upper",
order = "hclust",
tl.cex = 0.8,
col = paleta,
title = title,
mar = c(0,0,2,0))
}data_num <- data %>%
select(where(is.numeric))
mat_cor <- cor(data_num, use = "pairwise.complete.obs", method = "pearson")
label_map <- c(
mmse_total = "MMSE",
cog_digits_forward_total = "DF total",
cog_digits_backward_total = "DB total",
cog_digits_forward_span = "DF span",
cog_digits_backward_span = "DB span",
cog_category_animals = "Fluidez animales",
cog_category_vegetables = "Fluidez vegetales",
cog_tmt_a = "TMT-A (tiempo)",
cog_tmt_a_corr = "TMT-A (corr.)",
cog_tmt_a_err = "TMT-A (errores)",
cog_tmt_b = "TMT-B (tiempo)",
cog_tmt_b_corr = "TMT-B (corr.)",
cog_tmt_b_err = "TMT-B (errores)"
)
vars_to_keep <- names(label_map)
plot_corr(data, vars_to_keep, label_map, title = "Correlaciones")data_num <- data %>%
select(where(is.numeric))
mat_cor <- cor(data_num, use = "pairwise.complete.obs", method = "pearson")
label_map <- c(
mmse_total = "Desempeño Total (MMSE)",
cog_digits_forward_total = "Digits Forward",
cog_digits_backward_total = "Digits Backward",
cog_category_animals = "Fluidez animales",
cog_category_vegetables = "Fluidez vegetales",
cog_tmt_a = "Atención sostenida",
cog_tmt_b = "Funciones Ejecutivas"
)
vars_to_keep <- names(label_map)
plot_corr(data, vars_to_keep, label_map, title = "Correlaciones")label_map <- c(
mmse_total = "Desempeño Total (MMSE)",
cog_digits_forward_total = "Digits Forward",
cog_digits_backward_total = "Digits Backward",
cog_category_animals = "Fluidez animales",
cog_category_vegetables = "Fluidez vegetales",
cog_tmt_a = "Atención sostenida",
cog_tmt_b = "Funciones Ejecutivas"
)
vars_to_keep <- names(label_map)
plot_corr(dataAD, vars_to_keep, label_map, title = "Correlaciones - AD")label_map <- c(
mmse_total = "Desempeño Total (MMSE)",
cog_digits_forward_total = "Memoria",
cog_category_animals = "Lenguaje",
cog_tmt_b = "Funciones Ejecutivas - Atención"
)
vars_to_keep <- names(label_map)
plot_corr(dataAD, vars_to_keep, label_map, title = "Correlaciones - AD")# Columnas y labels
label_map <- c(
mmse_total = "Desempeño Total (MMSE)",
cog_digits_forward_total = "Memoria",
cog_category_animals = "Lenguaje",
cog_tmt_a = "Atención",
cog_tmt_b = "Funciones Ejecutivas - Atención"
)
vars_to_keep <- names(label_map)
# Graficar correlaciones originales
plot_corr(dataAD, vars_to_keep, label_map, title = "Correlaciones - AD (original)")# Número de permutaciones a mostrar
n_perm_show <- 5
set.seed(123) # para reproducibilidad
# Elegimos la columna a permutar
column_to_permute <- "cog_tmt_a"
# Generamos 5 permutaciones diferentes
for (i in 1:n_perm_show) {
data_perm <- dataAD
data_perm[[column_to_permute]] <- sample(data_perm[[column_to_permute]])
# Graficar correlación de esta permutación
plot_corr(data_perm, vars_to_keep, label_map,
title = paste("Correlaciones - AD (perm", i, ")"))
}# Columnas y labels
label_map <- c(
mmse_total = "Desempeño Total (MMSE)",
cog_digits_forward_total = "Memoria",
cog_category_animals = "Lenguaje",
cog_tmt_a = "Atención",
cog_tmt_b = "Funciones Ejecutivas - Atención"
)
vars_to_keep <- names(label_map)
# Correlación original
plot_corr(dataAD, vars_to_keep, label_map,
title = "Correlaciones - AD (original)")# Número de permutaciones a mostrar por par
n_perm_show <- 5
set.seed(123)
# Todas las combinaciones de pares
pairs <- combn(vars_to_keep, 2, simplify = FALSE)
# Loop por cada par de columnas
for (pair in pairs) {
col_perm <- pair[1]
col_ref <- pair[2]
for (i in seq_len(n_perm_show)) {
data_perm <- dataAD
data_perm[[col_perm]] <- sample(data_perm[[col_perm]])
plot_corr(
data_perm,
vars_to_keep,
label_map,
title = paste0(
"AD | permutando: ", label_map[col_perm],
" vs ", label_map[col_ref],
" (perm ", i, ")"
)
)
}
}#Limpio la base de las variables de digits span
#todos los datos
data <- data %>%
select(-cog_digits_backward_span, -cog_digits_forward_span, -tests_no_hechos)
#solamente AD y FTD
data2 <- data %>%
filter(clinical_diagnosis != "CN")
data2$clinical_diagnosis <- droplevels(data2$clinical_diagnosis)#Cambio los nombres de las variables
label_map <- c(
cog_ed = "Nivel educativo",
cog_digits_forward_total = "Memoria inmediata",
cog_digits_backward_total= "Memoria de trabajo",
mmse_total = "MMSE",
cog_category_animals = "Fluidez (animales)",
cog_category_vegetables = "Fluidez (vegetales)",
cog_tmt_a = "Atención",
cog_tmt_b = "Funciones ejecutivas",
cog_tmt_a_err = "Errores de atención",
cog_tmt_b_err = "Errores ejecutivos",
cog_tmt_a_corr = "Correctas atención",
cog_tmt_b_corr = "Correctas ejecutivas"
)
vars <- label_map[vars]
vars_original <- names(label_map)
data <- data%>%
rename_with(
~ ifelse(.x %in% names(label_map), label_map[.x], .x),
.cols = all_of(vars_original)
)
data2<- data2 %>%
rename_with(
~ ifelse(.x %in% names(label_map), label_map[.x], .x),
.cols = all_of(vars_original)
)#Test de correlaciones luego de las permutaciones. Permuta en cada par de variables. N=10000. Luego ordena según cuán desviado está de la media el delta de correlación (o sea cuánto)
perm_corr_test <- function(data, x, y, n_perm = 10000, method = "spearman") {
r_obs <- cor(data[[x]], data[[y]], method = method, use = "complete.obs")
r_perm <- replicate(n_perm, {
cor(
sample(data[[x]]),
data[[y]],
method = method,
use = "complete.obs"
)
})
tibble(
var_x = x,
var_y = y,
r_obs = r_obs,
r_perm_mean = mean(r_perm),
r_perm_sd = sd(r_perm),
delta_r = r_obs - mean(r_perm),
p_value = (sum(abs(r_perm) >= abs(r_obs)) + 1) / (n_perm + 1),
z_perm = (r_obs - mean(r_perm)) / sd(r_perm)
)
}
library(purrr)
results_all <- map_dfr(
combn(vars, 2, simplify = FALSE),
~ perm_corr_test(data, .x[1], .x[2], n_perm = 10000)
)
results_all <- results_all |>
dplyr::mutate(
p_adj = p.adjust(p_value, method = "BH")
)
results_all |>
arrange(desc(abs(z_perm)))#Ahora esto lo que arma es un score de fuerza de colinealidad sumando los valores absolutos de cada delta r para cada permutación. Con lo cual es un valor más general sobre colinealidad de esa variable.
colinearity_score <- function(data, x, vars, n_perm = 10000, method = "spearman") {
others <- setdiff(vars, x)
deltas <- sapply(others, function(y) {
r_obs <- cor(data[[x]], data[[y]], method = method, use = "complete.obs")
r_perm <- replicate(
n_perm,
cor(sample(data[[x]]), data[[y]], method = method, use = "complete.obs")
)
r_obs - mean(r_perm)
})
tibble::tibble(
var = x,
colinearity_strength = sum(abs(deltas)), # fuerza total
mean_delta = mean(abs(deltas)), # promedio
max_delta = max(abs(deltas)), # peor caso
n_links = length(deltas) # cuántas variables compara
)
}
library(purrr)
library(dplyr)
colinearity_tbl <- map_dfr(
vars,
~ colinearity_score(
data = data,
x = .x,
vars = vars,
n_perm = 10000 # o 5000 / 10000 si querés
)
)
colinearity_tbl %>%
arrange(desc(colinearity_strength))#Ahora hago lo mismo pero solamente con la data AD + FTD
results_all <- map_dfr(
combn(vars, 2, simplify = FALSE),
~ perm_corr_test(data2, .x[1], .x[2], n_perm = 10000)
)
results_all <- results_all |>
dplyr::mutate(
p_adj = p.adjust(p_value, method = "BH")
)
results_all |>
arrange(desc(abs(z_perm)))colinearity_tbl <- map_dfr(
vars,
~ colinearity_score(
data = data2,
x = .x,
vars = vars,
n_perm = 10000 # o 5000 / 10000 si querés
)
)
colinearity_tbl %>%
arrange(desc(colinearity_strength))#Modelo logistico con todas las variables para ver cuales dan "significativas"
modelo_todas <- glm(clinical_diagnosis~., data2, family=binomial)
summary(modelo_todas)##
## Call:
## glm(formula = clinical_diagnosis ~ ., family = binomial, data = data2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0489005 1.4691475 -1.395 0.163131
## `Nivel educativo` 0.0598843 0.0295263 2.028 0.042543 *
## `Memoria inmediata` 0.0013200 0.0708769 0.019 0.985141
## `Memoria de trabajo` -0.0734425 0.0868618 -0.846 0.397826
## MMSE 0.1427961 0.0369396 3.866 0.000111 ***
## `Fluidez (animales)` -0.0904753 0.0280834 -3.222 0.001274 **
## `Fluidez (vegetales)` -0.0354267 0.0314615 -1.126 0.260151
## Atención 0.0009477 0.0048826 0.194 0.846103
## `Errores de atención` 0.3527964 0.1639014 2.152 0.031359 *
## `Correctas atención` -0.0354815 0.0386131 -0.919 0.358150
## `Funciones ejecutivas` -0.0057870 0.0020984 -2.758 0.005818 **
## `Errores ejecutivos` 0.0609458 0.0419743 1.452 0.146507
## `Correctas ejecutivas` 0.0322205 0.0227271 1.418 0.156274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 502.21 on 398 degrees of freedom
## Residual deviance: 442.07 on 386 degrees of freedom
## (213 observations deleted due to missingness)
## AIC: 468.07
##
## Number of Fisher Scoring iterations: 4
## `Nivel educativo` `Memoria inmediata` `Memoria de trabajo`
## 1.206504 1.536875 1.769802
## MMSE `Fluidez (animales)` `Fluidez (vegetales)`
## 1.508477 1.913680 1.717200
## Atención `Errores de atención` `Correctas atención`
## 2.669761 1.321394 1.447586
## `Funciones ejecutivas` `Errores ejecutivos` `Correctas ejecutivas`
## 2.450613 1.400061 2.314099
#Hago modelo logistico con todas las variables y pero evaluando el AUC por cross validation
library(dplyr)
library(rsample)
library(purrr)
library(pROC)
#Funcion para calcular AUC de regresión logística por CV
cv_auc_logistic <- function(data, outcome, predictors, v = 10) {
df <- data[, c(outcome, predictors)]
formula <- as.formula(
paste(outcome, "~", paste(sprintf("`%s`", predictors), collapse = " + "))
)
folds <- vfold_cv(
df,
v = v,
strata = !!sym(outcome)
)
aucs <- purrr::map_dbl(folds$splits, function(split) {
train <- rsample::analysis(split)
test <- rsample::assessment(split)
fit <- glm(
formula,
data = train,
family = binomial()
)
prob <- predict(fit, test, type = "response")
pROC::roc(
response = test[[outcome]],
predictor = prob,
quiet = TRUE
)$auc
})
tibble::tibble(
auc_mean = mean(aucs),
auc_sd = sd(aucs),
auc_min = min(aucs),
auc_max = max(aucs)
)
}#Calculo con todas las variables, AUC medio de 0.67
set.seed(123)
auc_full <- cv_auc_logistic(
data = data2,
outcome = "clinical_diagnosis",
predictors = vars,
v = 10
)
auc_full#Grafico ese AUC "out of fold"
library(dplyr)
library(rsample)
library(purrr)
library(pROC)
cv_oof_pred <- function(data, outcome, predictors, v = 10, seed = 123) {
# 🛡 1) limpiar predictores
predictors <- predictors[!is.na(predictors)]
predictors <- predictors[predictors != ""]
# 🛡 2) backticks para nombres con espacios / tildes
predictors_q <- sprintf("`%s`", predictors)
set.seed(seed)
df <- data[, c(outcome, predictors)]
formula <- as.formula(
paste(outcome, "~", paste(predictors_q, collapse = " + "))
)
folds <- rsample::vfold_cv(df, v = v, strata = !!rlang::sym(outcome))
purrr::map_dfr(folds$splits, function(split) {
train <- rsample::analysis(split)
test <- rsample::assessment(split)
fit <- glm(
formula,
data = train,
family = binomial()
)
prob <- predict(fit, test, type = "response")
tibble::tibble(
y = test[[outcome]],
prob = as.numeric(prob)
)
})
}
oof <- cv_oof_pred(
data2,
"clinical_diagnosis",
vars,
v = 10,
seed = 123
)
roc_oof <- roc(
response = oof$y,
predictor = oof$prob,
quiet = TRUE
)
auc_val <- auc(roc_oof)
roc_df <- tibble(
fpr = 1 - roc_oof$specificities,
tpr = roc_oof$sensitivities
)
library(ggplot2)
ggplot(roc_df, aes(x = fpr, y = tpr)) +
geom_line(linewidth = 1.3, color = "#1f77b4") +
geom_abline(
intercept = 0, slope = 1,
linetype = "dashed",
color = "grey60"
) +
coord_equal() +
labs(
title = "ROC sin descartar variables",
subtitle = paste0("AUC = ", round(auc_val, 3)),
x = "1 − Especificidad",
y = "Sensibilidad"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
## Descarte de variables por “score” de colinealidad
#Tenia este "orden" de "Fortaleza de colinealidad". Voy a ir sacando de a una y a analizar el AUC
colinearity_tbl %>%
arrange(desc(colinearity_strength))#Voy sacando de a una y calculando, sumo todo a una tabla
vars_reduced <- c("Atención", "Memoria de trabajo", "MMSE", "Fluidez (animales)", "Memoria inmediata", "Fluidez (vegetales)","Errores ejecutivos", "Correctas ejecutivas", "Correctas atención", "Errores de atención", "Nivel educativo")
set.seed(123)
auc_reduced <- cv_auc_logistic(
data = data2,
outcome = "clinical_diagnosis",
predictors = vars_reduced,
v = 10
)
auc_reducedvars_reduced2<- c( "Memoria de trabajo", "MMSE", "Fluidez (animales)", "Memoria inmediata", "Fluidez (vegetales)","Errores ejecutivos", "Correctas ejecutivas", "Correctas atención", "Errores de atención", "Nivel educativo")
set.seed(123)
auc_reduced2 <- cv_auc_logistic(
data = data2,
outcome = "clinical_diagnosis",
predictors = vars_reduced2,
v = 10
)
auc_reduced2vars_reduced3 <- c( "MMSE", "Fluidez (animales)", "Memoria inmediata", "Fluidez (vegetales)","Errores ejecutivos", "Correctas ejecutivas", "Correctas atención", "Errores de atención", "Nivel educativo")
set.seed(123)
auc_reduced3 <- cv_auc_logistic(
data = data2,
outcome = "clinical_diagnosis",
predictors = vars_reduced3,
v = 10
)
auc_reduced3vars_reduced4 <- c("Fluidez (animales)", "Memoria inmediata", "Fluidez (vegetales)","Errores ejecutivos", "Correctas ejecutivas", "Correctas atención", "Errores de atención", "Nivel educativo")
set.seed(123)
auc_reduced4 <- cv_auc_logistic(
data = data2,
outcome = "clinical_diagnosis",
predictors = vars_reduced4,
v = 10
)
auc_reduced4#A medida que sacamos variables por colinealidad, PERDEMOS AUC
tablaauc <- rbind(auc_reduced, auc_reduced2, auc_reduced3, auc_reduced4)
rownames(tablaauc) <- c("Sacando 1 variable", "Sacando 2 variables", "Sacando 3 variables", "Sacando 4 variables")
tablaauc#Entonces probamos hacer Elastic Net (Lasso + Ridge, o cada una de ellas) para manejar colinealidad y seleccionar variables
library(glmnet)
library(pROC)
library(rsample)
library(purrr)
library(dplyr)
cv_auc_elastic <- function(data, outcome, predictors, alpha = 0.5, v = 10, seed = 123) {
predictors <- predictors[!is.na(predictors)] # 🛡 evita NA
predictors_q <- sprintf("`%s`", predictors) # 🛡 nombres con espacios
set.seed(seed)
folds <- rsample::vfold_cv(data, v = v, strata = !!rlang::sym(outcome))
aucs <- purrr::map_dbl(seq_along(folds$splits), function(i) {
set.seed(seed + i)
split <- folds$splits[[i]]
train <- rsample::analysis(split)
test <- rsample::assessment(split)
X_train <- model.matrix(
as.formula(paste(outcome, "~", paste(predictors_q, collapse = " + "))),
train
)[,-1]
X_test <- model.matrix(
as.formula(paste(outcome, "~", paste(predictors_q, collapse = " + "))),
test
)[,-1]
y_train <- train[[outcome]]
y_test <- test[[outcome]]
cv_fit <- glmnet::cv.glmnet(
X_train, y_train,
family = "binomial",
alpha = alpha,
nfolds = 5
)
prob <- predict(cv_fit, X_test, s = "lambda.min", type = "response")
pROC::roc(y_test, as.vector(prob), quiet = TRUE)$auc
})
tibble::tibble(
model = paste0("Elastic Net (alpha=", alpha, ")"),
auc_mean = mean(aucs),
auc_sd = sd(aucs)
)
}data2_clean <- data2 %>% drop_na()
#Ridge es el mejor, porque regulariza y maneja la colinealidad pero no elimina variables
auc_enet_ridge <- cv_auc_elastic(
data = data2_clean,
outcome = "clinical_diagnosis",
predictors = vars,
alpha = 0,
v = 10
)
auc_enet_ridgeauc_enet <- cv_auc_elastic(
data = data2_clean,
outcome = "clinical_diagnosis",
predictors = vars,
alpha = 0.5,
v = 10
)
auc_enetauc_enet_lasso <- cv_auc_elastic(
data = data2_clean,
outcome = "clinical_diagnosis",
predictors = vars,
alpha = 1,
v = 10
)
auc_enet_lasso#Esta calcula tambien los coeficientes para ver qué selecciona Lasso
cv_auc_elastic_with_coefs <- function(
data, outcome, predictors,
alpha = 1, v = 10, seed = 123
) {
# 🛡 1) limpiar NAs y vacíos
predictors <- predictors[!is.na(predictors)]
predictors <- predictors[predictors != ""]
# 🛡 2) backticks para nombres con espacios/tildes
predictors_q <- sprintf("`%s`", predictors)
set.seed(seed)
folds <- rsample::vfold_cv(data, v = v, strata = !!rlang::sym(outcome))
aucs <- numeric(length(folds$splits))
selected_vars <- vector("list", length(folds$splits))
for (i in seq_along(folds$splits)) {
set.seed(seed + i)
split <- folds$splits[[i]]
train <- rsample::analysis(split)
test <- rsample::assessment(split)
fml <- as.formula(paste(outcome, "~", paste(predictors_q, collapse = " + ")))
X_train <- model.matrix(fml, train)[, -1, drop = FALSE]
X_test <- model.matrix(fml, test)[, -1, drop = FALSE]
y_train <- train[[outcome]]
y_test <- test[[outcome]]
cv_fit <- glmnet::cv.glmnet(
X_train, y_train,
family = "binomial",
alpha = alpha,
nfolds = 5
)
# --- AUC ---
prob <- predict(cv_fit, X_test, s = "lambda.min", type = "response")
aucs[i] <- pROC::roc(y_test, as.vector(prob), quiet = TRUE)$auc
# --- VARIABLES SELECCIONADAS ---
coefs <- coef(cv_fit, s = "lambda.min") # ✅ método S3 correcto
sel <- rownames(coefs)[as.numeric(coefs) != 0]
sel <- setdiff(sel, "(Intercept)")
selected_vars[[i]] <- sel
}
tibble::tibble(
auc_mean = mean(aucs),
auc_sd = sd(aucs),
selected_vars = list(unlist(selected_vars))
)
}#Aca se ve la importancia Lasso de cada variable
library(dplyr)
lasso_importance <- tibble(var = res_lasso$selected_vars[[1]]) %>%
count(var) %>%
mutate(freq = n / 10) %>% # 10 folds
arrange(desc(freq))
lasso_importance#Ahora calculo la importancia por permutación de cada variable: de a una variable, permuto y veo cuánto cambia el AUC
perm_importance_cv <- function(
data, outcome, predictors,
alpha = 0, v = 10, n_perm = 20, seed = 123
) {
# 🛡 1) limpiar predictores
predictors <- predictors[!is.na(predictors)]
predictors <- predictors[predictors != ""]
# 🛡 2) backticks para nombres con espacios / tildes
predictors_q <- sprintf("`%s`", predictors)
set.seed(seed)
folds <- rsample::vfold_cv(data, v = v, strata = !!rlang::sym(outcome))
all_imp <- list()
for (i in seq_along(folds$splits)) {
set.seed(seed + i)
split <- folds$splits[[i]]
train <- rsample::analysis(split)
test <- rsample::assessment(split)
fml <- as.formula(
paste(outcome, "~", paste(predictors_q, collapse = " + "))
)
X_train <- model.matrix(fml, train)[, -1, drop = FALSE]
X_test <- model.matrix(fml, test)[, -1, drop = FALSE]
y_train <- train[[outcome]]
y_test <- test[[outcome]]
# --- modelo base ---
cv_fit <- glmnet::cv.glmnet(
X_train, y_train,
family = "binomial",
alpha = alpha,
nfolds = 5
)
prob_base <- predict(
cv_fit,
X_test,
s = "lambda.min",
type = "response"
)
auc_base <- pROC::roc(
y_test,
as.vector(prob_base),
quiet = TRUE
)$auc
# --- permutaciones ---
for (var in colnames(X_test)) {
auc_perm <- replicate(n_perm, {
Xp <- X_test
Xp[, var] <- sample(Xp[, var])
prob <- predict(
cv_fit,
Xp,
s = "lambda.min",
type = "response"
)
pROC::roc(
y_test,
as.vector(prob),
quiet = TRUE
)$auc
})
all_imp[[length(all_imp) + 1]] <- tibble::tibble(
var = var,
delta_auc = as.numeric(auc_base - mean(auc_perm))
)
}
}
dplyr::bind_rows(all_imp) %>%
dplyr::group_by(var) %>%
dplyr::summarise(
mean_delta_auc = mean(delta_auc),
.groups = "drop"
) %>%
dplyr::arrange(desc(mean_delta_auc))
}#Se ve que la importancia por permutación (o sea importancia de cada variable en la capacidad predictiva del modelo) es diferente que la importancia de las variables segun Lasso
set.seed(123)
perm_imp_ridge <- perm_importance_cv(
data = data2_clean,
outcome = "clinical_diagnosis",
predictors = vars,
alpha = 0, # 🔑 Ridge
v = 10, # CV externo
n_perm = 20, # permutaciones por variable por fold
seed = 123
)
perm_imp_ridgelibrary(dplyr)
# Permutation importance (Ridge)
perm_tbl <- perm_imp_ridge %>%
rename(
var = var,
delta_auc = mean_delta_auc
)
# LASSO estabilidad
lasso_tbl <- lasso_importance %>%
rename(
var = var,
lasso_freq = freq
)
# Unir todo
importance_tbl <- perm_tbl %>%
left_join(lasso_tbl, by = "var") %>%
mutate(
lasso_freq = ifelse(is.na(lasso_freq), 0, lasso_freq)
)
importance_tbl#Si bien lasso y la importancia por permutaciones coinciden en varias variables, en otras no tanto. Veamos el grafico
library(ggplot2)
library(ggrepel)
ggplot(
importance_tbl,
aes(x = lasso_freq, y = delta_auc, label = var)
) +
geom_point(size = 3, alpha = 0.8) +
geom_text_repel(
size = 4,
max.overlaps = Inf,
box.padding = 0.5,
point.padding = 0.3,
min.segment.length = 0
) +
geom_vline(xintercept = 0.5, linetype = 3, color = "grey50") +
geom_hline(yintercept = median(importance_tbl$delta_auc, na.rm = TRUE),
linetype = 3, color = "grey50") +
scale_x_continuous(
limits = c(0, 1.05),
breaks = c(0, 0.25, 0.5, 0.75, 1)
) +
labs(
x = "Estabilidad LASSO (frecuencia de selección)",
y = "Importancia predictiva (ΔAUC permutado)",
title = "Importancia de variables: selección vs contribución predictiva",
subtitle = "Comparación entre estabilidad bajo LASSO y pérdida de AUC al permutar"
) +
theme_minimal(base_size = 14)#Entonces tengo: Variables imprescindibles, arriba a la derecha; variables prescindibles en todos los sentidos, abajo a la izquierda
#Pruebo sacar las variables prescindibles por AUC y vuelvo a calcular AUC: MEJORA un poco! a 0.696
vars2 <- c("MMSE", "Fluidez (animales)", "Funciones ejecutivas", "Nivel educativo")
auc_enet_ridge <- cv_auc_elastic(
data = data2_clean,
outcome = "clinical_diagnosis",
predictors = vars2,
alpha = 0,
v = 10
)
auc_enet_ridge#Grafico curva ROC
oof <- cv_oof_pred(
data2_clean,
"clinical_diagnosis",
vars2,
v = 10,
seed = 123
)
roc_oof <- roc(
response = oof$y,
predictor = oof$prob,
quiet = TRUE
)
auc_val <- auc(roc_oof)
roc_df <- tibble(
fpr = 1 - roc_oof$specificities,
tpr = roc_oof$sensitivities
)
library(ggplot2)
ggplot(roc_df, aes(x = fpr, y = tpr)) +
geom_line(linewidth = 1.3, color = "#1f77b4") +
geom_abline(
intercept = 0, slope = 1,
linetype = "dashed",
color = "grey60"
) +
coord_equal() +
labs(
title = "ROC descartando variables",
subtitle = paste0("AUC = ", round(auc_val, 3)),
x = "1 − Especificidad",
y = "Sensibilidad"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)